import re
from typing import List

import pandas as pd
from llama_index.core.schema import TextNode

from iflytech_assistant.assistant.dataclasses import RagData
from iflytech_assistant.es import index

# Index name handed to `index()` once all nodes have been built.
INDEX_NAME = "heqc_examples_vote"

# Any character that is neither in U+4E00-U+9FA5 (Chinese characters) nor an
# ASCII letter.
_NON_LETTER = re.compile(r"[^\u4e00-\u9fa5a-zA-Z]")
# "A、" plus all following non-newline characters up to the end of the string.
_OPTION_TAIL = re.compile(r"A、.*$")


def _strip_non_letters(text):
    """Return *text* with everything but Chinese characters and ASCII letters removed."""
    return _NON_LETTER.sub("", text)


def _drop_option_tail(text):
    """Return *text* without its trailing "A、..." part, if it has one."""
    return _OPTION_TAIL.sub("", text)


def _drop_send_marker(text):
    """Return *text* with every "发" character deleted."""
    return text.replace("发", "")


df = pd.read_excel(".vscode/上屏发送数据.xlsx")

# NOTE(review): each cleaner expects a str; a blank cell (read as NaN) would
# raise here -- confirm the sheet has no empty cells in these columns.
df["用户偏好"] = df["用户偏好"].map(_strip_non_letters)
df["i_clickword"] = df["i_clickword"].map(_drop_option_tail)

# Keep only the six columns the rest of the script works with.
df = df[
    [
        "d_submodeid",
        "content",
        "userinput",
        "i_clickword",
        "send_word",
        "用户偏好",
    ]
]
df["d_submodeid"] = df["d_submodeid"].map(_drop_send_marker)


def _join_lines(values):
    r"""Join the non-missing entries of one grouped column with newlines.

    Missing cells (NaN/None) are skipped and the remaining entries are passed
    through ``str``, so a blank or numeric cell in any aggregated column can
    no longer make ``str.join`` raise ``TypeError`` and abort the run. For
    all-string input the result is exactly ``"\n".join(values)``.
    """
    return "\n".join(map(str, values.dropna()))


# Collapse rows that share the same (d_submodeid, 用户偏好, userinput) triple
# into a single row; every remaining column becomes the newline-joined
# sequence of its values within that group.
df = (
    df.groupby(["d_submodeid", "用户偏好", "userinput"])
    .agg(_join_lines)
    .reset_index()
)

def _row_to_node(row) -> TextNode:
    """Build the `TextNode` for one grouped row of `df`."""
    record = RagData(
        input=row["userinput"],
        target=row["d_submodeid"],
        tag=row["用户偏好"],
        mode="polish",
        # The grouped cell holds newline-joined strings; split them back
        # into a list of individual examples.
        examples=row["i_clickword"].split("\n"),
    )
    return record.to_text_node()


# One node per grouped row, in frame order.
nodes: List[TextNode] = [_row_to_node(row) for _, row in df.iterrows()]

# Hand every node to `index()` under INDEX_NAME.
index(nodes, INDEX_NAME)
